{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 文档加载"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### PDF文档加载"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pprint import pprint\n",
    "\n",
    "# Import the loader from `langchain_community`: the old `langchain.document_loaders`\n",
    "# path is only a deprecated re-export and is gone in newer LangChain releases.\n",
    "from langchain_community.document_loaders import PyPDFLoader\n",
    "\n",
    "# Create a PyPDFLoader instance; its argument is the path of the PDF to load\n",
    "# (relative to the notebook's working directory).\n",
    "loader = PyPDFLoader(\"../data/GDB中文调试手册.pdf\")\n",
    "# load() returns a list; the cells below inspect its length and the\n",
    "# `page_content` / `metadata` of the first entry.\n",
    "pages = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'list'>\n"
     ]
    }
   ],
   "source": [
    "# 查看加载数据\n",
    "print(type(pages))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "41\n"
     ]
    }
   ],
   "source": [
    "print(len(pages))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "GDB 中文调试手册 \n",
      " \n",
      " \n",
      "  \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      " \n",
      "PDF 制版：孙士才 \n",
      "2008-9-8 \n",
      "网络上交换资源，请使用PDF 格式，您用WORD 格式，不是强制别人使用Ｍ$Office 嘛！ \n",
      "（该资源属于网络收集）\n"
     ]
    }
   ],
   "source": [
    "print(pages[0].page_content[:500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'author': 'Administrator',\n",
      " 'creationdate': '2008-09-08T11:18:23+08:00',\n",
      " 'creator': 'pdfFactory Pro www.fineprint.cn',\n",
      " 'moddate': '2020-05-21T12:36:19+08:00',\n",
      " 'page': 0,\n",
      " 'page_label': '1',\n",
      " 'producer': 'pdfFactory Pro 3.17 (Windows XP Professional Chinese)',\n",
      " 'source': '../data/GDB中文调试手册.pdf',\n",
      " 'title': '用GDB调试程序',\n",
      " 'total_pages': 41}\n"
     ]
    }
   ],
   "source": [
    "pprint(pages[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 网页文档加载"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Import the loader from `langchain_community`: the old `langchain.document_loaders`\n",
    "# path is only a deprecated re-export and is gone in newer LangChain releases.\n",
    "from langchain_community.document_loaders import WebBaseLoader\n",
    "\n",
    "url = \"https://gitee.com/xaioyang123/chat-server/blob/master/README.md\"\n",
    "# Browser-like request headers handed to the loader through `header_template`.\n",
    "header = {\n",
    "    \"user-agent\":\"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0\",\n",
    "    # \"br\" is intentionally not advertised: Brotli responses can only be decoded\n",
    "    # when the optional brotli package is installed, otherwise the page text is garbled.\n",
    "    \"Accept-Encoding\":\"gzip, deflate\",\n",
    "    \"Accept\":\"*/*\"\n",
    "}\n",
    "# NOTE: this rebinds `loader` and `pages` from the PDF section above; from here on\n",
    "# they refer to the web page, not the PDF.\n",
    "loader = WebBaseLoader(web_path=url, header_template=header)\n",
    "\n",
    "pages = loader.load()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "list"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "langchain_core.documents.base.Document"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(pages[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "('\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'README.md · 予尘简/ChatServer - Gitee.com\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '登录\\n'\n",
      " '注册\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '开源\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '企业版\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '高校版\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '搜索\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '帮助中心\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '使用条款\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '关于我们\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '开源\\n'\n",
      " '企业版\\n'\n",
      " '高校版\\n'\n",
      " '私有云\\n'\n",
      " '\\n'\n",
      " '模力方舟\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '登录\\n'\n",
      " '注册\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '代码拉取完成，页面将自动刷新\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '捐赠\\n'\n",
      " '\\n'\n",
      " '捐赠前请先登录\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '取消\\n'\n",
      " '前往登录\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '扫描微信二维码支付\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '取消\\n'\n",
      " '支付完成\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '支付提示\\n'\n",
      " '\\n'\n",
      " '将跳转至支付宝完成支付\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '确定\\n'\n",
      " '取消\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'Watch\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '不关注\\n'\n",
      " '\\n'\n",
      " '关注所有动态\\n'\n",
      " '\\n'\n",
      " '仅关注版本发行动态\\n'\n",
      " '\\n'\n",
      " '关注但不提醒动态\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '1\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'Star\\n'\n",
      " '0\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'Fork\\n'\n",
      " '0\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " ' 予尘简/ChatServer\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " '代码\\n'\n",
      " '\\n'\n",
      " 'Issues\\n'\n",
      " '\\n'\n",
      " '0\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'Pull Requests\\n'\n",
      " '\\n'\n",
      " '0\\n'\n",
      " '\\n'\n",
      " '\\n'\n",
      " 'Wiki\\n'\n",
      " '\\n'\n",
      " '统计')\n"
     ]
    }
   ],
   "source": [
    "pprint(pages[0].page_content[:500])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'language': 'zh-CN',\n",
      " 'source': 'https://gitee.com/xaioyang123/chat-server/blob/master/README.md',\n",
      " 'title': 'README.md · 予尘简/ChatServer - Gitee.com'}\n"
     ]
    }
   ],
   "source": [
    "pprint(pages[0].metadata)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
